import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from constants import INF, SPATIAL, MORPHOLOGICAL, TEMPORAL
# Directory that holds the per-cluster feature CSV files.
PATH = 'clustersData/0'

# Read every file found in PATH (in sorted name order) and stack the rows into
# a single DataFrame.
# A single pd.concat replaces the former chain of DataFrame.append calls:
# append was deprecated in pandas 1.4, removed in pandas 2.0, and re-copied
# the accumulated frame on every iteration.
files = os.listdir(PATH)
frames = [pd.read_csv(os.path.join(PATH, file_name)) for file_name in sorted(files)]
if not frames:
    # Fail early with a clear message instead of an opaque error further down.
    raise FileNotFoundError(f"no files to load in {PATH!r}")

# Each file's own row index is kept (no ignore_index), which matches what the
# append-based loop produced.
df = pd.concat(frames)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1063 entries, 0 to 0 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dep_red 1063 non-null float64 1 dep_sd 1063 non-null float64 2 hyp_red 1063 non-null float64 3 hyp_sd 1063 non-null float64 4 graph_avg_speed 1063 non-null float64 5 graph_slowest_path 1063 non-null float64 6 graph_fastest_path 1063 non-null float64 7 geometrical_avg_shift 1063 non-null float64 8 geometrical_shift_sd 1063 non-null float64 9 geometrical_max_dist 1063 non-null float64 10 spatial_dispersion_count 1063 non-null float64 11 spatial_dispersion_sd 1063 non-null float64 12 da 1063 non-null float64 13 da_sd 1063 non-null float64 14 Channels contrast 1063 non-null float64 15 break_measure 1063 non-null float64 16 fwhm 1063 non-null float64 17 get_acc 1063 non-null float64 18 max_speed 1063 non-null float64 19 peak2peak 1063 non-null float64 20 trough2peak 1063 non-null float64 21 rise_coef 1063 non-null float64 22 smile_cry 1063 non-null float64 23 d_kl 1063 non-null float64 24 jump 1063 non-null float64 25 psd_center 1063 non-null float64 26 der_psd_center 1063 non-null float64 27 rise_time 1063 non-null float64 28 unif_dist 1063 non-null float64 29 num_spikes 1063 non-null float64 30 max_abs 1063 non-null float64 31 name 1063 non-null object 32 label 1063 non-null float64 dtypes: float64(32), object(1) memory usage: 282.4+ KB
# Raise pandas' row / column / width display limits before previewing the data.
display_settings = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Preview the first rows of the loaded frame.
df.head()
| dep_red | dep_sd | hyp_red | hyp_sd | graph_avg_speed | graph_slowest_path | graph_fastest_path | geometrical_avg_shift | geometrical_shift_sd | geometrical_max_dist | spatial_dispersion_count | spatial_dispersion_sd | da | da_sd | Channels contrast | break_measure | fwhm | get_acc | max_speed | peak2peak | trough2peak | rise_coef | smile_cry | d_kl | jump | psd_center | der_psd_center | rise_time | unif_dist | num_spikes | max_abs | name | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.75 | 0.707107 | 2.75 | 0.707107 | 68.149943 | 10.965856 | 262.164746 | 0.211559 | 0.124836 | 29.826142 | 2.0 | 0.280689 | 936.0 | 1.466900 | 0.0 | -331.844968 | 36.0 | 14.250416 | 35.0 | 734.301418 | 80.0 | 59.0 | -11.477378 | 0.376905 | 502.096897 | 11.360348 | 6.023163 | 243.0 | -0.096432 | 55919.0 | 555.943579 | es04feb12_1_1_2 | -1.0 |
| 0 | 195.00 | 9.205976 | 195.00 | 9.205976 | 7.660231 | 2.444956 | 52.125445 | 0.404714 | 0.320027 | 58.920182 | 1.0 | 0.238903 | 290.0 | 0.849948 | 0.0 | -1008.448141 | 36.0 | 86.991241 | 37.0 | 1454.894065 | 56.0 | 43.0 | -13.845540 | 0.215786 | 3.901893 | 535.072365 | 514.048222 | 147.0 | 0.044684 | 1803.0 | 1154.678314 | es04feb12_1_2_10 | -1.0 |
| 0 | 272.00 | 13.301786 | 272.00 | 13.301786 | 13.562496 | 1.078448 | 117.613923 | 0.144514 | 0.081919 | 20.470874 | 1.0 | 0.294090 | 598.0 | 1.169164 | 0.0 | -1443.647863 | 51.0 | 59.464278 | 68.0 | 1627.022787 | 116.0 | 115.0 | -21.964323 | 0.192050 | 14.593452 | 158.464169 | 282.278652 | 101.0 | 0.162192 | 1937.0 | 979.782137 | es04feb12_1_2_11 | -1.0 |
| 0 | 28.00 | 4.358899 | 28.00 | 4.358899 | 28.967240 | 4.642857 | 145.746209 | 0.179741 | 0.084282 | 25.096477 | 1.0 | 0.283303 | 918.0 | 1.436056 | 0.0 | -751.235046 | 34.0 | 34.902512 | 28.0 | 916.768223 | 117.0 | 53.0 | -8.240933 | 0.148997 | 106.967168 | 86.035406 | 126.522112 | 91.0 | 0.164513 | 8214.0 | 658.024836 | es04feb12_1_2_12 | 1.0 |
| 0 | 43.75 | 6.123724 | 43.75 | 6.123724 | 16.491517 | 3.920049 | 63.582105 | 0.206621 | 0.073081 | 33.743488 | 1.0 | 0.298570 | 1028.0 | 1.450329 | 0.0 | -1854.952889 | 39.0 | 48.542376 | 64.0 | 2123.447813 | 116.0 | 116.0 | -14.980838 | 0.214417 | 23.869564 | 67.870324 | 105.331613 | 117.0 | 0.118233 | 3476.0 | 1333.102992 | es04feb12_1_2_13 | 1.0 |
# Summary statistics of the loaded frame, before any rows are filtered out.
df.describe()
| dep_red | dep_sd | hyp_red | hyp_sd | graph_avg_speed | graph_slowest_path | graph_fastest_path | geometrical_avg_shift | geometrical_shift_sd | geometrical_max_dist | spatial_dispersion_count | spatial_dispersion_sd | da | da_sd | Channels contrast | break_measure | fwhm | get_acc | max_speed | peak2peak | trough2peak | rise_coef | smile_cry | d_kl | jump | psd_center | der_psd_center | rise_time | unif_dist | num_spikes | max_abs | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1063.000000 | 1.063000e+03 | 1063.000000 | 1063.000000 |
| mean | 628.031162 | 9.764127 | 423.294920 | 8.735876 | 21.080586 | 6.883599 | 105.876222 | 0.378737 | 0.344269 | 56.418367 | 2.405456 | 0.304798 | 1012.539981 | 1.172401 | 4.297912 | -1584.402902 | 35.035748 | 41.226457 | 36.534337 | 1806.247124 | 101.688617 | 58.140169 | -12.634168 | 0.229378 | 239.226058 | 119.953114 | 112.025357 | 139.673565 | 0.079918 | 3.647276e+04 | 1258.553004 | -0.113829 |
| std | 2305.143697 | 14.913159 | 1499.272555 | 12.112884 | 14.168496 | 6.750139 | 59.281135 | 0.191845 | 0.530900 | 27.907055 | 1.319936 | 0.038801 | 529.668999 | 0.211951 | 34.494323 | 1126.937620 | 9.293126 | 437.239316 | 19.960566 | 1073.016626 | 28.731892 | 23.353746 | 12.949271 | 0.095259 | 179.428501 | 129.835298 | 133.004510 | 61.760409 | 0.111791 | 9.748708e+04 | 741.626245 | 0.956309 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.020859 | 0.018442 | 2.465271 | 1.000000 | 0.156309 | 124.000000 | 0.517692 | 0.000000 | -7511.872638 | 15.000000 | -9999.000000 | 7.000000 | 16.824444 | 10.000000 | 8.000000 | -84.177516 | 0.077713 | -117.249486 | 6.877197 | 3.472632 | 49.000000 | -0.361332 | 4.290000e+02 | 20.546620 | -3.000000 |
| 25% | 10.187500 | 2.546444 | 10.125000 | 2.541833 | 10.623957 | 2.411846 | 58.118350 | 0.237632 | 0.134977 | 35.784426 | 1.000000 | 0.281145 | 696.000000 | 1.058399 | 0.000000 | -2158.791261 | 30.000000 | 40.805642 | 22.000000 | 1102.444308 | 99.500000 | 42.000000 | -17.380732 | 0.167505 | 105.402888 | 41.656845 | 25.367231 | 94.000000 | 0.004028 | 4.128000e+03 | 790.127821 | -1.000000 |
| 50% | 47.000000 | 4.924429 | 46.375000 | 4.910130 | 17.609260 | 5.024938 | 96.902804 | 0.363474 | 0.234468 | 53.842292 | 2.000000 | 0.304111 | 902.000000 | 1.200967 | 0.000000 | -1346.885163 | 34.000000 | 57.415023 | 31.000000 | 1588.715455 | 117.000000 | 52.000000 | -12.210957 | 0.207251 | 217.134766 | 80.605792 | 65.570565 | 125.000000 | 0.095493 | 9.865000e+03 | 1097.891450 | -1.000000 |
| 75% | 189.812500 | 9.581068 | 187.687500 | 9.570051 | 28.048278 | 8.933223 | 143.574079 | 0.484546 | 0.356825 | 71.738375 | 3.000000 | 0.327343 | 1182.000000 | 1.315380 | 0.000000 | -804.605535 | 38.000000 | 79.180716 | 45.000000 | 2306.117919 | 117.000000 | 69.000000 | -7.959136 | 0.263534 | 339.933126 | 132.118259 | 137.636853 | 175.000000 | 0.162358 | 2.417300e+04 | 1602.155878 | 1.000000 |
| max | 34548.250000 | 114.713502 | 19143.750000 | 85.540561 | 83.777180 | 60.299254 | 366.748329 | 1.855837 | 8.452481 | 140.667142 | 8.000000 | 0.421263 | 3256.000000 | 1.666536 | 541.596859 | 476.111022 | 91.000000 | 196.104579 | 123.000000 | 6832.274229 | 203.000000 | 117.000000 | 107.086444 | 0.985809 | 743.055806 | 600.747082 | 602.508346 | 415.000000 | 0.298575 | 1.526168e+06 | 4953.776048 | 1.000000 |
# Row filters, expressed as boolean masks and applied in one step:
#   * keep only rows whose label is non-negative;
#   * drop rows whose d_kl equals -INF (presumably a sentinel for a value that
#     could not be computed -- confirm against constants.INF).
# NOTE(review): only d_kl is checked against the sentinel here; other columns
# are not filtered by this step.
has_non_negative_label = df["label"].ge(0)
has_valid_d_kl = df["d_kl"].ne(-INF)
df = df.loc[has_non_negative_label & has_valid_d_kl]

# Summary statistics of the rows that survived the filters.
df.describe()
| dep_red | dep_sd | hyp_red | hyp_sd | graph_avg_speed | graph_slowest_path | graph_fastest_path | geometrical_avg_shift | geometrical_shift_sd | geometrical_max_dist | spatial_dispersion_count | spatial_dispersion_sd | da | da_sd | Channels contrast | break_measure | fwhm | get_acc | max_speed | peak2peak | trough2peak | rise_coef | smile_cry | d_kl | jump | psd_center | der_psd_center | rise_time | unif_dist | num_spikes | max_abs | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 530.000000 | 5.300000e+02 | 530.000000 | 530.000000 |
| mean | 361.081604 | 7.658351 | 250.441509 | 7.019369 | 22.048016 | 7.362424 | 109.435652 | 0.388699 | 0.328867 | 56.450173 | 2.341509 | 0.303413 | 932.535849 | 1.156497 | 1.981877 | -1791.602323 | 33.728302 | 46.631937 | 36.196226 | 1997.128521 | 101.256604 | 58.498113 | -13.660958 | 0.206866 | 271.730833 | 93.773239 | 83.975813 | 124.367925 | 0.107961 | 4.845652e+04 | 1384.816295 | 0.792453 |
| std | 1533.544371 | 11.362721 | 1001.498470 | 9.166846 | 13.019691 | 6.619013 | 59.355767 | 0.192545 | 0.510884 | 26.684352 | 1.275912 | 0.038972 | 456.345407 | 0.201883 | 11.178278 | 1142.791419 | 7.476602 | 437.948131 | 19.748302 | 1094.675370 | 29.712011 | 23.286353 | 10.798125 | 0.071769 | 175.055112 | 89.062560 | 96.581353 | 53.610463 | 0.098882 | 1.248243e+05 | 755.576197 | 0.405934 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.047929 | 0.018442 | 8.699463 | 1.000000 | 0.174545 | 130.000000 | 0.562012 | 0.000000 | -7511.872638 | 18.000000 | -9999.000000 | 10.000000 | 16.824444 | 10.000000 | 8.000000 | -77.572713 | 0.077713 | -71.559562 | 7.442545 | 3.534918 | 49.000000 | -0.254774 | 6.980000e+02 | 21.514729 | 0.000000 |
| 25% | 9.000000 | 2.384848 | 9.000000 | 2.372521 | 12.433902 | 2.690582 | 62.281733 | 0.237298 | 0.135322 | 37.151902 | 1.000000 | 0.277655 | 653.000000 | 1.039708 | 0.000000 | -2316.092753 | 30.000000 | 45.325465 | 22.000000 | 1264.411507 | 113.250000 | 41.000000 | -17.854637 | 0.150766 | 134.908088 | 38.019227 | 21.105335 | 80.000000 | 0.046329 | 6.363500e+03 | 862.782529 | 1.000000 |
| 50% | 34.312500 | 4.262842 | 34.125000 | 4.225932 | 19.786739 | 5.742786 | 101.575195 | 0.372739 | 0.240819 | 53.958976 | 2.000000 | 0.300317 | 856.000000 | 1.175606 | 0.000000 | -1550.039051 | 33.000000 | 62.376723 | 30.000000 | 1770.332448 | 117.000000 | 52.000000 | -12.447052 | 0.198755 | 248.929871 | 76.188401 | 57.051944 | 117.000000 | 0.113020 | 1.319900e+04 | 1197.972815 | 1.000000 |
| 75% | 130.500000 | 8.135570 | 127.750000 | 8.053518 | 28.961900 | 9.748466 | 146.573724 | 0.498573 | 0.361715 | 71.607345 | 3.000000 | 0.328254 | 1065.000000 | 1.287813 | 0.000000 | -1007.877874 | 37.000000 | 85.567910 | 44.750000 | 2517.085157 | 117.000000 | 68.000000 | -8.424648 | 0.235705 | 361.875435 | 118.221308 | 104.912136 | 153.000000 | 0.186693 | 2.929325e+04 | 1719.008969 | 1.000000 |
| max | 19719.250000 | 93.345795 | 16525.500000 | 78.887578 | 70.289427 | 50.717354 | 366.748329 | 1.640206 | 8.452481 | 140.667142 | 7.000000 | 0.421263 | 3256.000000 | 1.643654 | 145.254381 | 242.645403 | 85.000000 | 144.122825 | 114.000000 | 6832.274229 | 203.000000 | 117.000000 | 17.596043 | 0.681195 | 743.055806 | 555.472016 | 549.873485 | 361.000000 | 0.295213 | 1.526168e+06 | 4682.726671 | 1.000000 |
# Column used to group the samples on the x axis of every plot.
TARGET_COLUMN_NAME = 'label'

# Select all numerical features.
numerical_features = df.select_dtypes(["float64", "int64"])
plot_df = numerical_features.astype("float64")  # this is done to solve a problem in sns (see https://datascience.stackexchange.com/questions/55435/seaborn-violin-plot-error-no-loop-for-unfunc-add)

# The target column is numeric too, so it is part of numerical_features.
# Plotting it against itself only yields a degenerate row, so it is skipped
# here; numerical_features itself is left unchanged and plot_df still carries
# the target because it is needed for the x axis.
features_to_plot = [
    column for column in numerical_features.columns
    if column != TARGET_COLUMN_NAME
]

# Create distribution plots: one row per feature, violin plot on the left and
# box plot on the right.
nrows = len(features_to_plot)
# squeeze=False keeps `ax` two-dimensional even when only one feature is
# plotted, so the ax[i, j] indexing below always works.
fig, ax = plt.subplots(nrows=nrows, ncols=2, figsize=(20, 40), squeeze=False)
for i, feature in enumerate(features_to_plot):
    sns.violinplot(x=TARGET_COLUMN_NAME, y=feature, data=plot_df, ax=ax[i, 0])
    if i == 0:
        # Column headers are only needed on the first row.
        ax[i, 0].set_title("Violin Plots")
        ax[i, 1].set_title("Box Plots")
    sns.boxplot(x=TARGET_COLUMN_NAME, y=feature, data=plot_df, ax=ax[i, 1])
    # Per-axes x labels are blanked; a single shared label is added below.
    ax[i, 0].set_xlabel("")
    ax[i, 1].set_xlabel("")
    # The feature name is shown once per row, on the left-hand plot only.
    ax[i, 1].set_ylabel("")
    ax[i, 0].set_ylabel(feature, rotation=45, labelpad=50)

# Shared x-axis caption and overall title for the whole figure.
_ = fig.text(0.6, 0, "cell type", ha='center')
_ = fig.suptitle("Numerical Feature Distributions", y=1, x=0.6)
fig.tight_layout()
# Pairwise correlation between the numeric columns.
# The frame also holds a string column, and since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently (it raises instead), so the
# numeric columns are selected explicitly.
correlation_matrix = df.select_dtypes(include="number").corr()

# Annotated heat map of the correlation matrix, drawn on the axes created here
# rather than on whatever axes happen to be current.
fig, ax = plt.subplots(figsize=(22, 22))
_ = sns.heatmap(correlation_matrix, annot=True, fmt='.2f', ax=ax)

# Scatter-plot matrix of the columns, coloured by label.
sns.pairplot(df, hue="label")
<seaborn.axisgrid.PairGrid at 0x1fc1ce61280>